# Load ZTRAX current assessor data
pacman::p_load(data.table, readxl, fst)

source("code/globals.R")


################## Functions that load the Main, Building, BuildingAreas, and Values tables respectively for a given data vintage ######################

readMain <- function(vintageDirectory, rows2load, state) {
  layoutZAsmt <- read_excel(file.path(vintageDirectory, "Layout.xlsx"), sheet = 1)
  col_namesMain <- layoutZAsmt[layoutZAsmt$TableName == "utMain", "FieldName"]
  base <- fread(file.path(vintageDirectory, state, "/ZAsmt/Main.txt"),
    nrows = rows2load,
    sep = "|",
    header = FALSE,
    stringsAsFactors = FALSE,
    # skipNul = TRUE, # tells R to treat two adjacent delimiters as dividing a column
    # comment.char = "", # tells R not to read any symbol as a comment
    quote = "", # this tells R not to read quotation marks as a special symbol
    col.names = col_namesMain$FieldName
  )
  base <- as.data.table(base)
  base <- base[, list(
    RowID, ImportParcelID, LoadID,
    FIPS, State, County,
    AssessorParcelNumber, UnformattedAssessorParcelNumber, ParcelSequenceNumber, ParcelNumberTypeStndCode,
    DupAPN, AlternateParcelNumber, OldParcelNumber,
    PropertyFullStreetAddress,
    PropertyHouseNumber, PropertyHouseNumberExt, PropertyStreetPreDirectional, PropertyStreetName, PropertyStreetSuffix, PropertyStreetPostDirectional,
    PropertyCity, PropertyState, PropertyZip, PropertyZip4,
    PropertyBuildingNumber, PropertyAddressUnitDesignator, PropertyAddressUnitNumber,
    PropertyAddressLatitude, PropertyAddressLongitude, PropertyAddressCensusTractAndBlock,
    NoOfBuildings,
    LotSizeAcres, LotSizeSquareFeet,
    TaxAmount, TaxYear, PropertyGeocodeQualityCode, PropertyAddressQualityCode, OriginalPropertyFullStreetAddress, ExtractDate
  )]

  # Keep only one record for each ImportPropertyID.
  # ImportParcelID is the unique identifier of a parcel. Multiple entries for the same ImportParcelID are due to updated records.
  # The most recent record is identified by the greatest LoadID.
  #   **** This step may not be necessary for the published dataset as we intend to only publish the updated records, but due dilligence demands we check.

  length(unique(base$ImportParcelID)) # Number of unique ImportParcelIDs
  dim(base)[1] # Number of rows in the base dataset

  if (length(unique(base$ImportParcelID)) != dim(base)[1]) {
    # Example: Print all entries for parcels with at least two records.
    print(base[ImportParcelID %in% base[duplicated(ImportParcelID), ImportParcelID], ][order(ImportParcelID)])

    setkeyv(base, c("ImportParcelID", "LoadID")) # Sets the index and also orders by ImportParcelID, then LoadID increasing
    keepRows <- base[, .I[.N], by = c("ImportParcelID")] # Creates a table where the 1st column is ImportParcelID and the second column
    # gives the row number of the last row that ImportParcelID appears.
    base <- base[keepRows[[2]], ] # Keeps only those rows identified in previous step
  }
  return(base)
}


readBldg <- function(vintageDirectory, rows2load, state) {
  layoutZAsmt <- read_excel(file.path(vintageDirectory, "Layout.xlsx"), sheet = 1)
  col_namesBldg <- layoutZAsmt[layoutZAsmt$TableName == "utBuilding", "FieldName"]
  bldg <- fread(file.path(vintageDirectory, state, "ZAsmt/Building.txt"),
    nrows = rows2load, # this is set just to test it out. Remove when code runs smoothly.
    sep = "|",
    header = FALSE,
    stringsAsFactors = FALSE,
    # skipNul = TRUE, # tells R to treat two ajacent delimiters as dividing a column
    # comment.char = "", # tells R not to read any symbol as a comment
    quote = "", # this tells R not to read quotation marks as a special symbol
    col.names = col_namesBldg$FieldName
  )
  # bldg <- as.data.table(bldg)
  bldg <- bldg[, list(
    RowID, NoOfUnits, BuildingOrImprovementNumber,
    YearBuilt, EffectiveYearBuilt, YearRemodeled,
    NoOfStories, StoryTypeStndCode, TotalRooms, TotalBedrooms,
    FullBath, ThreeQuarterBath, HalfBath, QuarterBath, TotalCalculatedBathCount, TotalActualBathCount,
    RoofCoverStndCode, RoofStructureTypeStndCode,
    HeatingTypeorSystemStndCode,
    PropertyLandUseStndCode, BuildingClassStndCode, BuildingQualityStndCode, BuildingQualityStndCodeOriginal
  )]
  #  Reduce bldg dataset to Residential Properties
  bldg <- bldg[
    PropertyLandUseStndCode %in% c(
      "RR000", # RESIDENTIAL GENERAL
      "RR101", # SINGLE FAMILY RESIDENTIAL
      "RR102", # RURAL RESIDENCE
      "RR103", # MOBILE HOME
      "RR104", # TOWNHOUSE
      "RR105", # CLUSTER HOME
      "RR106", # CONDOMINIUM
      "RR107", # COOPERATIVE
      "RR108", # ROW HOUSE
      "RR109", # PLANNED UNIT DEVELOPMENT
      "RR110", # RESIDENTIAL COMMON AREA
      "RR111", # TIMESHARE
      "RR112", # SEASONAL, CABIN, VACATION RESIDENCE
      "RR113", # BUNGALOW
      "RR114", # ZERO LOT LINE
      "RR115", # MANUFACTURED, MODULAR, PREFABRICATED HOMES
      "RR116", # PATIO HOME
      "RR117", # RESIDENTIAL PARKING GARAGE
      "RR118", # MISCELLANEOUS IMPROVEMENT
      "RR119", # GARDEN HOME
      "RR120", # LANDOMINIUM
      "RR999", # INFERRED SINGLE FAMILY RESIDENTIAL
      "RI000", # RESIDENTIAL INCOME GENERAL (MULTI FAMILY)
      "RI101", # DUPLEX (2 UNITS, ANY COMBINATION)
      "RI102", # TRIPLEX (3 UNITS, ANY COMBINATION)
      "RI103", # QUADRUPLEX (4 UNITS, ANY COMBINATION)
      "RI104", # APARTMENT BUILDING (5+ UNITS)
      "RI105", # APARTMENT BUILDING (100+ UNITS)
      "RI106", # GARDEN APARTMENT, COURT APARTMENT (5+ UNITS)
      "RI107", # HIGH-RISE APARTMENT
      "RI108", # BOARDING HOUSE ROOMING HOUSE APT HOTEL TRANSIENT LODGING
      "RI109", # MOBILE HOME PARK, TRAILER PARK
      "RI110", # MULTIFAMILY DWELLING (GENERIC ANY COMBINATION 2+)
      "RI111", # FRATERNITY HOUSE, SORORITY HOUSE
      "RI112", # APARTMENT (GENERIC)
      "RI113", # DORMITORY, GROUP QUARTERS (RESIDENTIAL)
      "RI114"
    ), # RESIDENTIAL CONDOMINIUM DEVELOPMENT (ASSOCIATION ASSESSMENT)
  ]
  return(bldg)
}


readAreas <- function(vintageDirectory, rows2load, state) {
  layoutZAsmt <- read_excel(file.path(vintageDirectory, "Layout.xlsx"), sheet = 1)
  col_namesBldgA <- layoutZAsmt[layoutZAsmt$TableName == "utBuildingAreas", "FieldName"]
  sqft <- fread(file.path(vintageDirectory, state, "/ZAsmt/BuildingAreas.txt"),
    nrows = rows2load, # this is set just to test it out. Remove when code runs smoothly.
    sep = "|",
    header = FALSE,
    stringsAsFactors = FALSE,
    # skipNul = TRUE, # tells R to treat two ajacent delimiters as dividing a column
    # comment.char = "", # tells R not to read any symbol as a comment
    quote = "", # this tells R not to read quotation marks as a special symbol
    col.names = col_namesBldgA$FieldName
  )
  # sqft <- as.data.table(sqft)
  # Counties report different breakdowns of building square footage and/or call similar concepts by different names.
  # The structure of this table is to keep all entries reported by the county as they are given. See 'Bldg Area' table in documentation.
  # The goal of this code is to determine the total square footage of each property.
  # We assume a simple logic to apply across all counties here. Different logic may be as or more valid.
  # The logic which generates square footage reported on our sites is more complex, sometimes county specific, and often influenced by user interaction and update.
  sqft <- sqft[
    BuildingAreaStndCode %in% c(
      "BAL", # Building Area Living
      "BAF", # Building Area Finished
      "BAE", # Effective Building Area
      "BAG", # Gross Building Area
      "BAJ", # Building Area Adjusted
      "BAT", # Building Area Total
      "BLF"
    ), # Building Area Finished Living
  ]
  table(sqft$BuildingOrImprovementNumber) # BuildingOrImprovementNumber > 1  refers to additional buildings on the parcel.
  sqft <- sqft[, list(sqfeet = max(BuildingAreaSqFt, na.rm = T)), by = c("RowID", "BuildingOrImprovementNumber")]
  return(sqft)
}



readValue <- function(vintageDirectory, rows2load, state) {
  layoutZAsmt <- read_excel(file.path(vintageDirectory, "Layout.xlsx"), sheet = 1)
  col_namesValue <- layoutZAsmt[layoutZAsmt$TableName == "utValue", "FieldName"]
  val <- fread(file.path(vintageDirectory, state, "ZAsmt/Value.txt"),
    nrows = rows2load, # this is set just to test it out. Remove when code runs smoothly.
    sep = "|",
    header = FALSE,
    stringsAsFactors = FALSE,
    # skipNul = TRUE, # tells R to treat two ajacent delimiters as dividing a column
    # comment.char = "", # tells R not to read any symbol as a comment
    quote = "", # this tells R not to read quotation marks as a special symbol
    col.names = col_namesValue$FieldName
  )
  # val <- as.data.table(val)
  val <- val[, list(
    RowID,
    TotalAssessedValue, ImprovementAssessedValue, ImprovementMarketValue, ImprovementAppraisalValue,
    AssessmentYear, MarketValueYear, AppraisalValueYear
  )]
  return(val)
}

prototyping <- FALSE

if (prototyping) {
  rows2load <- 100000
} else {
  rows2load <- -1
}

ptm <- proc.time()


datafolder2016 <- file.path(ZTRAX, "2016")
datafolder2018 <- file.path(ZTRAX, "2018")
datafolder2020 <- file.path(ZTRAX, "20201012")

file_locations <- data.frame(
  year = c(2016, 2018, 2020),
  fileloc = c(datafolder2016, datafolder2018, datafolder2020)
)

fips_and_abbr <- data.frame(
  fips = c("04", "06", "08", "35", "41", "53"),
  stateabbr = c("AZ", "CA", "CO", "NM", "OR", "WA")
)

load_data <- function(statefips, deliveryyear) {
  print(paste(statefips, deliveryyear, sep = " "))
  datafolder <- file_locations$fileloc[file_locations$year == deliveryyear]
  print("reading main.txt")
  basedata <- readMain(datafolder, rows2load, statefips)
  print("reading building.txt")
  bldgdata <- readBldg(datafolder, rows2load, statefips)
  print("reading buildingareas.txt")
  sqftdata <- readAreas(datafolder, rows2load, statefips)
  print("reading value.txt")
  valdata <- readValue(datafolder, rows2load, statefips)

  # Merge together and save an R Data file
  print("merging")
  attrdata <- merge(basedata, bldgdata, by = "RowID")
  attrdata <- merge(attrdata, sqftdata, by = c("RowID", "BuildingOrImprovementNumber"), all.x = TRUE)
  attrdata <- merge(attrdata, valdata, by = "RowID", all.x = TRUE)
  colSums(is.na(attrdata))
  print(table(attrdata$ExtractDate, useNA = "always"))
  print("saving")
  stateabbr <- fips_and_abbr$stateabbr[fips_and_abbr$fips == statefips]
  write_fst(attrdata, file.path(WORKING, "ztraxdata", paste0(stateabbr, "attr", deliveryyear, ".fst")))
}

### -- Not California
ptm <- proc.time()
load_data("04", 2016)
load_data("08", 2016)
load_data("08", 2020)
load_data("41", 2020)
load_data("53", 2020)
proc.time() - ptm

### -- California
ptm <- proc.time()
load_data("06", 2016)
load_data("06", 2018)
load_data("06", 2020)
proc.time() - ptm
